In [35]:
## 사전실행코드
import polars as pl
df_spotify = (pl.read_csv("./universal_top_spotify_songs.csv", try_parse_dates = True,
null_values = [""])
    .filter(pl.col('snapshot_date').dt.year() == 2024).sort('snapshot_date'))

df_spotify = (
    df_spotify.with_columns(pl.when(pl.col('country').is_null() == True) ## country 열이 null이면
        .then(pl.lit('WW')) ## WW로 변경
        .otherwise(pl.col('country')).alias('country')) ## 아니면 원래대로
    .drop_nulls()) ## 그 외 null이 들어간 행 삭제

key_levels = pl.Enum(["C", "C#", "D", "Eb", "E", "F", "F#", "G", "G#", "A", "Bb", "B"])

df_spotify = (df_spotify.with_columns(pl.col('key').cast(pl.String)
    .replace(["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
        ["C", "C#", "D", "Eb", "E", "F", "F#", "G", "G#", "A", "Bb", "B"]))
    .with_columns(pl.col('key').cast(key_levels)).sort('key'))

df_spotify = (df_spotify.with_columns(pl.col('artists').str.split(', ')) ## ,를 기준으로 문자열을 분리
    ## 리스트의 첫 번째 아이템을 가져와서 main_vocal로 저장
    .with_columns(pl.col('artists').list.get(0, null_on_oob = True).alias('main_vocal'),
        pl.col('artists').list.tail(-1).alias('featuring')) ## 첫 번째 아이템을 제외한 나머지를featuring으로 저장
    .with_columns(pl.when(pl.col('featuring').list.len() == 0) ## 리스트 길이가 0이면
        .then(None) ## None으로 설정
        .otherwise(pl.col('featuring')).name.keep())) ## 아니면 그대로 유지

import pycountry_convert as pc

def get_continent_name(nation_code: str) -> str:
    if nation_code != 'WW':
        continent_code = pc.country_alpha2_to_continent_code(nation_code)
    else:
        continent_code = 'WW'
    continent_dict = {"NA": "North America","SA": "South America", "AS": "Asia", "AF": "Africa",
        "OC": "Oceania", "EU": "Europe", "AQ": "Antarctica", "WW": "Global"}
    return continent_dict[continent_code]

df_spotify = (df_spotify.with_columns(pl.col('country')
    ## 앞서 정의한 함수를 country 열에 적용
    .map_elements(get_continent_name, return_dtype = pl.String).alias('continent')))
In [36]:
pl.Config(set_tbl_cols = 10, set_tbl_rows = 25)
(df_spotify.select(pl.col([pl.Int64, pl.Float64])).describe()
    .transpose(include_header = True, header_name = 'columns',
column_names = ["count", "null_count", "mean", "std", "min", "25%", "50%", "75%", "max"])[1:])
Out[36]:
shape: (16, 10)
columnscountnull_countmeanstdmin25%50%75%max
strstrstrstrstrstrstrstrstrstr
"daily_rank""1281585.0""0.0""25.488521635318765""14.428953828238736""1.0""13.0""25.0""38.0""50.0"
"daily_movement""1281585.0""0.0""0.9522794040192418""7.048585204849036""-49.0""-1.0""0.0""2.0""49.0"
"weekly_movement""1281585.0""0.0""2.6433954829371444""11.958440767017487""-49.0""-3.0""0.0""5.0""49.0"
"popularity""1281585.0""0.0""75.99977215713355""15.612366849702711""0.0""65.0""79.0""88.0""100.0"
"duration_ms""1281585.0""0.0""192147.9038237807""49510.28735642052""16320.0""160413.0""184250.0""216338.0""939666.0"
"danceability""1281585.0""0.0""0.6840186434766325""0.13658257971471674""0.0""0.594""0.703""0.785""0.988"
"energy""1281585.0""0.0""0.6552570154470442""0.1615410243104661""0.0000201""0.557""0.673""0.768""0.998"
"loudness""1281585.0""0.0""-6.383302003378629""2.551654916926727""-37.334""-7.754""-5.952""-4.668""3.233"
"mode""1281585.0""0.0""0.5399329736225065""0.49840300103880864""0.0""0.0""1.0""1.0""1.0"
"speechiness""1281585.0""0.0""0.09353411915713745""0.08890688930938012""0.0""0.0391""0.0574""0.108""0.937"
"acousticness""1281585.0""0.0""0.2664934433930094""0.24594449685538267""3.45e-6""0.0638""0.183""0.431""0.996"
"instrumentalness""1281585.0""0.0""0.01710200619257404""0.0913520037990717""0.0""0.0""1.2e-6""0.0000754""0.977"
"liveness""1281585.0""0.0""0.16931368773823038""0.12396759653567327""0.0139""0.0957""0.12""0.201""0.978"
"valence""1281585.0""0.0""0.5548212007865261""0.2268662283516568""0.0""0.374""0.559""0.739""0.992"
"tempo""1281585.0""0.0""122.27016181993389""28.035717348594673""0.0""100.019""119.96""140.082""236.089"
"time_signature""1281585.0""0.0""3.907079124677645""0.387560985392341""0.0""4.0""4.0""4.0""5.0"
In [37]:
## plotly가 주피터 노트북, 주피터 랩에서 표시되지 않는 경우 아래의 코드를 실행시키세요
import plotly.io as pio
pio.renderers.default = "notebook_connected"

import plotly.express as px
fig = px.histogram(df_spotify, x = 'daily_rank')
fig.show()
In [38]:
fig = px.histogram(df_spotify, x = 'daily_movement')
fig.show()
In [39]:
fig = px.histogram(df_spotify, x = 'weekly_movement')
fig.show()
In [40]:
fig = px.histogram(df_spotify, x = 'popularity')
fig.show()
In [41]:
fig = px.pie(df_spotify.group_by('is_explicit').len('count'), values = "count")
fig.update_traces(textinfo = "percent+label")
fig.show()
In [42]:
pl.Config(set_tbl_cols = 10, set_tbl_rows = 25)
(df_spotify.select(pl.col([pl.String, pl.List(pl.String)])). describe()
    .transpose(include_header = True, header_name = 'columns',
column_names = ['count', 'null_count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])[1:])
Out[42]:
shape: (8, 10)
columnscountnull_countmeanstdmin25%50%75%max
strstrstrstrstrstrstrstrstrstr
"spotify_id""1281585""0"nullnull"003vvx7Niy0yvhvHt4a68B"nullnullnull"7zyWm8JihcIiYmfNkbzeHE"
"name""1281585""0"nullnull""Se""nullnullnull"한 페이지가 될 수 있게"
"artists""1281585.0""0.0"nullnullnullnullnullnullnull
"country""1281585""0"nullnull"AE"nullnullnull"ZA"
"album_name""1281585""0"nullnull""Cold Sweats""nullnullnull"黑玻璃"
"main_vocal""1281585""0"nullnull"$$Double-Dolla$$"nullnullnull"高爾宣 OSN"
"featuring""519897.0""761688.0"nullnullnullnullnullnullnull
"continent""1281585""0"nullnull"Africa"nullnullnull"South America"
In [43]:
df_spotify.select(pl.col([pl.String]).unique().len())
Out[43]:
shape: (1, 6)
spotify_idnamecountryalbum_namemain_vocalcontinent
u32u32u32u32u32u32
1636014550731108657707
In [44]:
fig = px.bar((df_spotify.group_by('country').agg(pl.len().alias('count'))
    .with_columns(pl.when(pl.col('country') == "KR").then(pl.lit("KR"))
        .when(pl.col('country') == "WW").then(pl.lit("Global")).otherwise(pl.lit("Others")).
alias('국가'))),
    x = 'country', y = 'count', color = '국가')
fig.update_xaxes(categoryorder = "total descending")
fig.show()
In [45]:
df_spotify.group_by('country').len().sort('len', descending = True)
Out[45]:
shape: (73, 2)
countrylen
stru32
"IT"17720
"DO"17718
"NI"17715
"CZ"17709
"PL"17709
"CR"17708
"SV"17708
"HN"17708
"EG"17708
"FI"17707
"HU"17706
"KZ"17706
"TH"17706
……
"IN"17454
"MA"17454
"LV"17448
"CA"17447
"US"17387
"UA"17360
"NZ"17295
"LU"17281
"VN"16954
"UY"16952
"VE"16808
"GB"16682
In [46]:
fig = px.bar(df_spotify.group_by('continent').agg(pl.col('country').unique().len()),
    x = 'continent', y = 'country', text = 'country')
fig.update_xaxes(categoryorder = "total descending")
fig.show()
In [47]:
(df_spotify.group_by('continent').agg(pl.col('country').unique().len()).sort('country',
descending = True))
Out[47]:
shape: (7, 2)
continentcountry
stru32
"Europe"29
"Asia"17
"North America"10
"South America"10
"Africa"4
"Oceania"2
"Global"1